STA426 Exercise 10: preprocess and explore a single cell data

Author

Gonzalo Cardenal

Published

November 27, 2023

knitr::opts_chunk$set(echo = TRUE)
startTime = proc.time()

The single cell dataset consists of peripheral blood mononuclear cells. These cells comprise different cell types. For each cell the ground truth is provided in terms of the assigned cell type that was derived using additional data. A cell has “unassigned” as cell type if it could not be reliably assigned to a cell type.

  1. Modify the quality-based cell filtering to remove the low quality cells that could not be assigned. Try with different thresholds
  2. Try different clustering methods and parameters such that different cell types are in separate clusters.

The example workflow is inspired by http://bioconductor.org/books/release/OSCA/

suppressPackageStartupMessages(library(scuttle))
suppressPackageStartupMessages(library(scran))
suppressPackageStartupMessages(library(scater))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(HDF5Array))
suppressPackageStartupMessages(library(hdf5r))
suppressPackageStartupMessages(library(pheatmap))
BPPARAM = BiocParallel::MulticoreParam(workers=2)
BPPARAM = BiocParallel::SerialParam()

Question 01: Load the data

sce = readRDS("pbmc-sce.RDS")
sce
class: SingleCellExperiment 
dim: 36601 10194 
metadata(1): Samples
assays(1): counts
rownames(36601): ENSG00000243485 ENSG00000237613 ... ENSG00000278817
  ENSG00000277196
rowData names(3): ID Symbol Type
colnames: NULL
colData names(3): Sample Barcode cellType
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

Optionally subsample 2000 cells to reduce runtime

set.seed(38)
sce = sce[ , sample(1:ncol(sce), size = 2000, replace = FALSE)]

The cell type abundances are

cd = as.data.frame(colData(sce))
ggplot(cd, aes(x=cellType)) + geom_bar() +
   scale_fill_manual() + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

Question 2

Compute and visualize quality scores

isMito <- grepl("^mt-", rowData(sce)$Symbol, ignore.case=TRUE)

sce <- addPerCellQC(sce, subsets=list(Mito=isMito), percent_top=100,
                    detection_limit=0, BPPARAM=BPPARAM)
  • number of detected genes
sce_detected <- sce$detected
head(sce_detected)
[1] 4803 2233 4324  343 4143 2332
  • number of assigned reads
sce_assigned <- sce$sum
head(sce_assigned)
[1] 22575  7758 21733   860 15311 10013
  • percentage of mitochondrial reads
sce_percentage_mitochondrial <- sce$subsets_Mito_percent
head(sce_percentage_mitochondrial)
[1]  5.333333  8.842485  6.322183 31.511628  7.909346  7.779886

Visualize the quality scores with plotColData.

#Violin plot
gridExtra::grid.arrange(
plotColData(sce, "detected", "Sample",
       colour_by = "cellType") +  ggtitle("NGSC3 QC scores"),
plotColData(sce, "sum", "Sample",
       colour_by = "cellType") ,
plotColData(sce, "subsets_Mito_percent", "Sample",
       colour_by = "cellType") 
)

#Violin plot
plotColData(sce, "sum", "Sample",
       colour_by = "cellType") 

#Violin plot
plotColData(sce, "subsets_Mito_percent", "Sample",
       colour_by = "cellType") 

plotColData(sce, "detected",  "Sample",
       colour_by = "cellType") 

isMito <- grepl("^mt-", rowData(sce)$Symbol, ignore.case=TRUE)

sce <- addPerCellQC(sce, subsets=list(Mito=isMito), percent_top=100,
                    detection_limit=0, BPPARAM=BPPARAM)

Filter by quality scores. The function isOutlier defines outliers based on the difference to the median scores. Refer to the manual. Consider also setting manual thresholds.

Setting other thresholds

qc.lib <- isOutlier(sce$sum, log=TRUE, type="lower")
qc.nexprs <- isOutlier(sce$detected, log=TRUE, type="lower")
qc.mito <- isOutlier(sce$subsets_Mito_percent, type="higher")
qc.top <- isOutlier(sce$percent.top_100, type="higher")

discard <- qc.lib | qc.nexprs | qc.mito | qc.top

sce$discard = discard

sceFilt = sce[ , !sce$discard]
unassignedFilt <- sceFilt[sceFilt$cellType == "unassigned", ]
f_unassigned <- nrow(unassignedFilt)/nrow(sceFilt)

minCellsExpressed = 5
isExpressed <- Matrix::rowSums(counts(sceFilt) >= 1) >= minCellsExpressed
sceFilt = sceFilt[isExpressed, ]
#Violin plot
gridExtra::grid.arrange(
plotColData(sce, "detected", "Sample",
       colour_by = "discard") +  ggtitle("NGSC3 QC Filtered data"),
plotColData(sce, "sum", "Sample",
       colour_by = "discard") ,
plotColData(sce, "subsets_Mito_percent", "Sample",
       colour_by = "discard") 
)

We can set stricter filtering criteria:

qc.lib2 <- isOutlier(sce$sum, log=TRUE, type="lower", nmads = 2)
qc.nexprs2 <- isOutlier(sce$detected, log=TRUE, type="lower", nmads = 2)
qc.mito2 <- isOutlier(sce$subsets_Mito_percent, type="higher", nmads = 2)
qc.top2 <- isOutlier(sce$percent.top_100, type="higher", nmads = 2)

discard2 <- qc.lib2 | qc.nexprs2 | qc.mito2 | qc.top2

sce$discard2 = discard2

sceFilt2 = sce[ , !sce$discard2]
unassignedFilt_2 <- sceFilt2[sceFilt2$cellType == "unassigned", ]
f_unassigned_2 <- nrow(unassignedFilt_2)/nrow(sceFilt)

minCellsExpressed = 5
isExpressed <- Matrix::rowSums(counts(sceFilt2) >= 1) >= minCellsExpressed
sceFilt2 = sceFilt2[isExpressed, ]

#Violin plots with nMADs = 2 
gridExtra::grid.arrange(
plotColData(sce, "detected", "Sample",
       colour_by = "discard") +  ggtitle("NGSC3 QC Filtered data stricter threshold"),
plotColData(sce, "sum", "Sample",
       colour_by = "discard") ,
plotColData(sce, "subsets_Mito_percent", "Sample",
       colour_by = "discard") 
)

With adaptive tresholds:

df = DataFrame(sum = sce$sum,
               detected = sce$detected,
          mito = sce$subsets_Mito_percent,
          top = sce$percent.top_100)

reasons <- perCellQCFilters(df, 
    sub.fields=c("sum","detected", "mito", "top"), nmads = 1)
colSums(as.matrix(reasons))
  low_lib_size low_n_features       high_sum  high_detected      high_mito 
          1782           1473           2377           2644           2119 
      high_top        discard 
           635           5631 
sce$discard3 = reasons$discard

sceFilt = sce[ , !sce$discard3]
unassignedFilt_3 <- sceFilt[sceFilt$cellType == "unassigned", ]
f_unassigned_3 <- nrow(unassignedFilt_3)/nrow(sceFilt)

minCellsExpressed = 5
isExpressed <- Matrix::rowSums(counts(sceFilt) >= 1) >= minCellsExpressed
sceFilt = sceFilt[isExpressed, ]

#Violin plot
gridExtra::grid.arrange(
plotColData(sce, "detected", "Sample",
       colour_by = "discard") +  ggtitle("NGSC3 QC Filtered data Adaptive Threshold"),
plotColData(sce, "sum", "Sample",
       colour_by = "discard") ,
plotColData(sce, "subsets_Mito_percent", "Sample",
       colour_by = "discard") 
)

Here we run three different filtering methods. The function perCellQCFilters calls isOutlier for each of the subfields specified, it is just a different way to run the filtering. With this specific method when use the stricter deviation to the median allowing only nmads (Median absolute deviation) = 1, narrowing the outliers to a small deviation from the median. The default value of isOutlier takes nmads = 3, so we tried for nmads 3,2 and 1 respectively.As the tutorial mentions, to find a appropriate threshold values for for each experimental protocol and biological system requires considerable experience. Here he just tried different thresholds based on the deviation to the median.

sum(sce$discard3)
[1] 5631
sum(sce$discard2)
[1] 1333
sum(sce$discard)
[1] 891
f_unassigned
[1] 0.03767657
f_unassigned_2
[1] 0.06653696
f_unassigned_3
[1] 0.03617388

We can observe that nmads = 1 got the lowest fraction of unassigned cells 3.62%, however this stricter threshold also discarted 5631 cells, which is not optimal. Our initial criteria(default nmads for isOutlier) with nmads = 3 did a better work filtering, as the fraction of unassigned cells was 3.77% but it only filtered out 891 cells. When selecting this parameter, it is important to find a nice balance between filtering the outliers and still mantein a good number of the data.

Question 3

Normalize

sceFilt = logNormCounts(sceFilt)
sceFilt_2 = logNormCounts(sceFilt)

Compute reduced dimension representation

set.seed(38)
dec <- modelGeneVarByPoisson(sceFilt, BPPARAM=BPPARAM)
topGenes <- getTopHVGs(dec, n = 2000)
sceFilt <- runPCA(sceFilt, subset_row=topGenes, BPPARAM=BPPARAM) 
sceFilt <- runUMAP(sceFilt, dimred="PCA", BPPARAM=BPPARAM)

Clustering

The first clustering is ran with 10 centers for the kmeans

set.seed(100)
clust.kmeans <- kmeans(reducedDim(sceFilt, "PCA"), centers=10)
table(clust.kmeans$cluster)

  1   2   3   4   5   6   7   8   9  10 
445 580 290 268 390 406 879 516 597 192 
colLabels(sceFilt) <- factor(clust.kmeans$cluster)
plotReducedDim(sceFilt, "UMAP", colour_by="label") + ggtitle("K=10 labels colored")

plotReducedDim(sceFilt, "UMAP", colour_by="cellType") +  ggtitle("K=10 cell types colored")

We can ran it for one center per each cell type, i.e. K=27

set.seed(100)
clust.kmeans.27 <- kmeans(reducedDim(sceFilt, "PCA"), centers=27)
table(clust.kmeans.27$cluster)

  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
132 140 126 256 137 133 356 147 119 202  12 343 286  94  78 131 202 404 177 172 
 21  22  23  24  25  26  27 
156 156 115 122 106  86 175 
colLabels(sceFilt) <- factor(clust.kmeans.27$cluster)
plotReducedDim(sceFilt, "UMAP", colour_by="label") + ggtitle("K=27 labels colored")

plotReducedDim(sceFilt, "UMAP", colour_by="cellType") + ggtitle("K=27 cell types colored")

We can also run other clustering methods: Graph-based clustering SNN

g <- buildSNNGraph(sceFilt, use.dimred="PCA")
clust.SNN <- igraph::cluster_walktrap(g)$membership

colLabels(sceFilt) <- factor(clust.SNN)
plotReducedDim(sceFilt, "UMAP", colour_by="label") + ggtitle("SNN walktrap labels colored")

plotReducedDim(sceFilt, "UMAP", colour_by="cellType") + ggtitle("SNN walktrap cell types colored")

Graph-based clustering SNN

g <- buildKNNGraph(sceFilt, use.dimred="PCA")
clust.KNN <- igraph::cluster_walktrap(g)$membership

colLabels(sceFilt) <- factor(clust.KNN)

plotReducedDim(sceFilt, "UMAP", colour_by="label") + ggtitle("KNN walktrap labels colored")

plotReducedDim(sceFilt, "UMAP", colour_by="cellType") + ggtitle("KNN walktrap cell types colored")

Louvain

g <- buildSNNGraph(sceFilt, use.dimred="PCA")
clust.SNN.louvain <- igraph::cluster_louvain(g)$membership

colLabels(sceFilt) <- factor(clust.SNN.louvain)
plotReducedDim(sceFilt, "UMAP", colour_by="label") + ggtitle("SNN Louvain labels colored")

plotReducedDim(sceFilt, "UMAP", colour_by="cellType") + ggtitle("SNN Louvain cell types colored")

Leiden

g <- buildSNNGraph(sceFilt, use.dimred="PCA")
clust.SNN.leiden <- igraph::cluster_leiden(g)$membership

colLabels(sceFilt) <- factor(clust.SNN.leiden)
plotReducedDim(sceFilt, "UMAP", colour_by="label") + ggtitle("SNN Leiden labels colored")

plotReducedDim(sceFilt, "UMAP", colour_by="cellType") + ggtitle("SNN Leiden cell types colored")

Question 4

Compute agreement of clusters with cell types

tab <- table(clust.kmeans$cluster, sceFilt$cellType )
phm <- pheatmap(log10(tab+10), main="K-Means for K=10 cluster vs cell type",
    color=viridis::viridis(100), silent=FALSE)
phm

tab <- table(clust.kmeans.27$cluster, sceFilt$cellType )
phm <- pheatmap(log10(tab+10), main="K-Means for K=27 cluster vs cell type",
    color=viridis::viridis(100), silent=FALSE)
phm

tab <- table(clust.SNN, sceFilt$cellType )
phm <- pheatmap(log10(tab+10), main="SNN cluster vs cell type",
    color=viridis::viridis(100), silent=FALSE)
phm

tab <- table(clust.KNN, sceFilt$cellType )
phm <- pheatmap(log10(tab+10), main="KNN cluster vs cell type",
    color=viridis::viridis(100), silent=FALSE)
phm

tab <- table(clust.SNN.louvain, sceFilt$cellType )
phm <- pheatmap(log10(tab+10), main="SNN louvain cluster vs cell type",
    color=viridis::viridis(100), silent=FALSE)
phm

tab <- table(clust.SNN.leiden, sceFilt$cellType )
phm <- pheatmap(log10(tab+10), main="SNN leiden cluster vs cell type",
    color=viridis::viridis(100), silent=FALSE)
phm

Sensitivity and specificity of low quality cell detection

mean(sce$discard[sce$cellType == "unassigned"])
[1] 0.5217984
mean(!sce$discard[sce$cellType != "unassigned"])
[1] 0.9463002

Agreement of clustering and cell type

mclust::adjustedRandIndex(clust.kmeans$cluster,sceFilt$cellType)
[1] 0.5119036

To evaluate the clusters quality, we can just compute the metric adjusted rand index. For our other clustering methods:

mclust::adjustedRandIndex(clust.kmeans.27$cluster,sceFilt$cellType)
[1] 0.3106225
mclust::adjustedRandIndex(clust.SNN,sceFilt$cellType)
[1] 0.6167082
mclust::adjustedRandIndex(clust.KNN,sceFilt$cellType)
[1] 0.5969283
mclust::adjustedRandIndex(clust.SNN.louvain,sceFilt$cellType)
[1] 0.6492335
mclust::adjustedRandIndex(clust.SNN.leiden,sceFilt$cellType)
[1] 0.254534

The adjusted Rand Index measures the similarity between two classifications of the same objects by the proportions of agreements between the two partitions. By comparing to sceFilt$cellType we can estimate how the clustering methods performed, from a ratio from 0 to 1. The clustering method with the highest adjusted Rand index was SNN.louvain, and therefore we can conclude SNN.louvain was the best method to cluster our data.

Runtime in seconds

paste(signif((proc.time() - startTime)[ "elapsed"], digits=4), "s")
[1] "40.38 s"